1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#> 
#>  0  1 
#> 85 89

DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)

signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])

1.2.0.1 Standarize the names for the reporting

studyName <- "DARWIN"
dataframe <- DARWIN
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
174 450
pander::pander(table(dataframe[,outcome]))
0 1
85 89

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1000

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9992136

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 450 , Uni p: 0.006350853 , Uncorrelated Base: 268 , Outcome-Driven Size: 0 , Base Size: 268 
#> 
#> 
 1 <R=0.999,w=  1,N=   82>, Top: 41( 1 )[ 1 : 41 : 0.975 ]( 41 , 41 , 0 ),<|>Tot Used: 82 , Added: 41 , Zero Std: 0 , Max Cor: 0.974
#> 
 2 <R=0.974,w=  1,N=   82>, Top: 18( 1 )[ 1 : 18 : 0.962 ]( 18 , 19 , 41 ),<|>Tot Used: 118 , Added: 19 , Zero Std: 0 , Max Cor: 0.960
#> 
 3 <R=0.960,w=  1,N=   82>, Top: 8( 1 )[ 1 : 8 : 0.955 ]( 8 , 8 , 58 ),<|>Tot Used: 134 , Added: 8 , Zero Std: 0 , Max Cor: 0.955
#> 
 4 <R=0.955,w=  2,N=   43>, Top: 21( 1 )[ 1 : 21 : 0.927 ]( 21 , 22 , 66 ),<|>Tot Used: 174 , Added: 22 , Zero Std: 0 , Max Cor: 0.927
#> 
 5 <R=0.927,w=  2,N=   43>, Top: 9( 1 )[ 1 : 9 : 0.914 ]( 9 , 10 , 84 ),<|>Tot Used: 188 , Added: 10 , Zero Std: 0 , Max Cor: 0.912
#> 
 6 <R=0.912,w=  2,N=   43>, Top: 2( 1 )[ 1 : 2 : 0.906 ]( 2 , 2 , 89 ),<|>Tot Used: 191 , Added: 2 , Zero Std: 0 , Max Cor: 0.906
#> 
 7 <R=0.906,w=  3,N=  107>, Top: 49( 2 )[ 1 : 49 : 0.853 ]( 48 , 51 , 90 ),<|>Tot Used: 277 , Added: 51 , Zero Std: 0 , Max Cor: 0.913
#> 
 8 <R=0.913,w=  3,N=  107>, Top: 6( 1 )[ 1 : 6 : 0.856 ]( 6 , 6 , 130 ),<|>Tot Used: 281 , Added: 6 , Zero Std: 0 , Max Cor: 0.853
#> 
 9 <R=0.853,w=  4,N=   57>, Top: 29( 1 )[ 1 : 29 : 0.800 ]( 28 , 28 , 132 ),<|>Tot Used: 308 , Added: 28 , Zero Std: 0 , Max Cor: 0.929
#> 
 10 <R=0.929,w=  4,N=   57>, Top: 4( 1 )[ 1 : 4 : 0.814 ]( 4 , 4 , 143 ),<|>Tot Used: 308 , Added: 4 , Zero Std: 0 , Max Cor: 0.808
#> 
 11 <R=0.808,w=  5,N=    4>, Top: 2( 1 )[ 1 : 2 : 0.800 ]( 2 , 2 , 144 ),<|>Tot Used: 308 , Added: 2 , Zero Std: 0 , Max Cor: 0.797
#> 
 12 <R=0.000,w=  6,N=    0>
#> 
 [ 12 ], 0.7965355 Decor Dimension: 308 . Cor to Base: 163 , ABase: 131 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

489

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

333

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.9

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

4.58

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7965355

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_mean_jerk_in_air6 200 : La_disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
total_time23 0.767 0.909 -0.366 0.736 6.93e-05 0.863
total_time15 0.775 1.062 -0.442 0.572 4.78e-01 0.844
air_time23 0.599 0.766 -0.374 0.715 2.31e-02 0.844
air_time15 0.684 1.112 -0.506 0.669 7.09e-01 0.829
total_time17 0.806 1.082 -0.400 0.966 3.10e-02 0.824
paper_time23 0.690 1.106 -0.435 0.703 6.55e-01 0.814
air_time17 0.674 0.980 -0.378 0.863 8.86e-02 0.806
paper_time17 0.664 1.045 -0.413 0.929 1.79e-01 0.796
total_time6 0.680 1.069 -0.364 0.665 7.13e-01 0.790
air_time16 0.426 0.841 -0.414 0.650 8.51e-01 0.787


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
air_time23 0.5993 0.766 -0.37359 0.715 2.31e-02 0.844
air_time15 0.6835 1.112 -0.50588 0.669 7.09e-01 0.829
air_time17 0.6742 0.980 -0.37796 0.863 8.86e-02 0.806
air_time16 0.4258 0.841 -0.41386 0.650 8.51e-01 0.787
disp_index23 0.5808 0.924 -0.35306 0.816 3.70e-01 0.787
air_time6 0.5641 0.982 -0.41988 0.746 6.19e-01 0.784
air_time7 0.5315 0.829 -0.23828 0.882 7.92e-03 0.779
gmrt_in_air7 -0.4478 0.811 0.42274 0.794 9.97e-01 0.775
paper_time9 0.4679 0.890 -0.41181 0.710 7.56e-01 0.774
air_time2 0.3619 0.810 -0.44747 0.699 1.52e-01 0.773
La_total_time5 0.2557 0.509 0.00354 0.164 1.13e-07 0.730
La_mean_speed_on_paper13 -0.0359 0.115 0.02486 0.195 1.90e-05 0.728
La_mean_speed_on_paper2 -0.1135 0.389 0.08430 0.290 1.43e-08 0.716
La_disp_index21 -0.4089 0.624 -0.01813 0.304 8.89e-01 0.706
La_paper_time3 0.2542 0.406 0.00794 0.231 5.43e-01 0.693

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.11 172 0.382


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
air_time23 0.5993 0.766 -0.37359 0.715 2.31e-02 0.844 0.844 1
air_time15 0.6835 1.112 -0.50588 0.669 7.09e-01 0.829 0.829 1
air_time17 0.6742 0.980 -0.37796 0.863 8.86e-02 0.806 0.806 1
air_time16 0.4258 0.841 -0.41386 0.650 8.51e-01 0.787 0.787 1
disp_index23 0.5808 0.924 -0.35306 0.816 3.70e-01 0.787 0.787 1
air_time6 0.5641 0.982 -0.41988 0.746 6.19e-01 0.784 0.784 1
air_time7 0.5315 0.829 -0.23828 0.882 7.92e-03 0.779 0.779 1
gmrt_in_air7 -0.4478 0.811 0.42274 0.794 9.97e-01 0.775 0.775 1
paper_time9 0.4679 0.890 -0.41181 0.710 7.56e-01 0.774 0.774 2
air_time2 0.3619 0.810 -0.44747 0.699 1.52e-01 0.773 0.773 1
La_total_time5 -0.813paper_time5 + 1.000total_time5 0.2557 0.509 0.00354 0.164 1.13e-07 0.730 0.674 -1
La_mean_speed_on_paper13 -0.971gmrt_on_paper13 + 1.000mean_speed_on_paper13 -0.0359 0.115 0.02486 0.195 1.90e-05 0.728 0.626 -1
mean_speed_on_paper2 NA -0.3422 0.901 0.35546 0.928 4.91e-01 0.720 0.720 NA
La_mean_speed_on_paper2 -0.878gmrt_on_paper2 + 1.000mean_speed_on_paper2 -0.1135 0.389 0.08430 0.290 1.43e-08 0.716 0.720 -1
paper_time3 NA 0.3381 1.043 -0.40899 0.989 6.14e-01 0.715 0.715 NA
La_disp_index21 + 1.000disp_index21 -0.906paper_time21 -0.4089 0.624 -0.01813 0.304 8.89e-01 0.706 0.538 -1
La_paper_time3 -0.946disp_index3 + 1.000paper_time3 0.2542 0.406 0.00794 0.231 5.43e-01 0.693 0.715 -1
total_time5 NA 0.3576 1.238 -0.21141 0.766 6.87e-01 0.674 0.674 NA
disp_index3 NA 0.0887 1.012 -0.44082 1.088 4.68e-01 0.669 0.669 1
gmrt_on_paper2 NA -0.2603 0.961 0.30870 1.034 9.26e-01 0.663 0.663 2
paper_time5 NA 0.1254 1.429 -0.26454 0.926 8.36e-01 0.629 0.629 1
mean_speed_on_paper13 NA -0.3097 0.976 0.13242 0.759 9.69e-01 0.626 0.626 NA
gmrt_on_paper13 NA -0.2820 1.004 0.11081 0.765 9.59e-01 0.606 0.606 2
paper_time21 NA 0.1398 1.174 -0.08999 1.084 8.16e-01 0.542 0.542 NA
disp_index21 NA -0.2822 1.300 -0.09963 0.981 7.01e-02 0.538 0.538 NA